/**
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
**/
/**
This file is part of Save For Offline, an Android app which saves / downloads complete webpages for offline reading.
**/
/**
If you modify, redistribute, or write something based on this or parts of it, you MUST,
I repeat, you MUST comply with the GPLv2+ license. This means that if you use or modify
my code, you MUST release the source code of your modified version, if / when this is
required under the terms of the license.
If you cannot / do not want to do this, DO NOT USE MY CODE. Thanks.
(I've added this message to the source because it's been used in several proprietary
closed source apps, which I don't want, and which is also a violation of the license.)
**/
/**
Written by Jonas Czech (JonasCz, stackoverflow.com/users/4428462/JonasCz and github.com/JonasCz) originally and partially based on https://github.com/PramodKhare/GetMeThatPage/
with lots of improvements. (4428462jonascz/eafc4d1afq)
**/
package jonas.tool.saveForOffline;
import com.squareup.okhttp.Cache;
import com.squareup.okhttp.OkHttpClient;
import com.squareup.okhttp.Request;
import com.squareup.okhttp.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Entities;
import org.jsoup.select.Elements;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.SynchronousQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class PageSaver {
/** Receives the progress, log and error notifications produced while a page is being saved. */
private EventCallback eventCallback;
/** HTTP client used for every request of this saver; timeouts and redirect handling are set in the constructor. */
private OkHttpClient client = new OkHttpClient();
/** Tag attached to every request so that cancel() can abort all of them through the client in one call. */
private final String HTTP_REQUEST_TAG = "TAG";
/**
 * Set by cancel(), cleared by resetState(). It is polled by the loops in getPage() and by the
 * download tasks running on the pool threads, hence volatile: a cancel issued from another
 * thread must become visible to all of them.
 */
private volatile boolean isCancelled = false;
/** Settings that decide which kinds of resources are saved and which user agent is sent. */
private Options options = new Options();
// filesToGrab - maintains all the links to files (eg images, scripts) which we are going to grab/download
private List<String> filesToGrab = new ArrayList<String>();
// framesToGrab - list of html frame files to download, as we parse these recursively
private List<String> framesToGrab = new ArrayList<String>();
// cssToGrab - list of all css files to download and parse, these need to be parsed to extract urls
private List<String> cssToGrab = new ArrayList<String>();
/** Title of the first document parsed since the last resetState(); empty until then. */
private String title = "";
/** Favicon URL obtained from FaviconFetcher for the first document parsed; empty until then. */
private String pageIconUrl = "";
/** File name under which the main HTML document is written; replaced on every getPage() call. */
private String indexFileName = "index.html";
/** Matches every character that is not an ASCII letter, a digit, '-', '_' or '.'; getFileName() turns each match into '_'. */
private final Pattern fileNameReplacementPattern = Pattern.compile("[^a-zA-Z0-9-_\\.]");
/**
 * Gives access to the settings of this saver.
 *
 * @return the {@link Options} instance held by this saver (the object itself, not a copy)
 */
public Options getOptions() {
    return options;
}
/**
 * Returns the page title recorded while parsing.
 *
 * @return the stored title, or the empty string if none has been recorded since the last reset
 */
public String getPageTitle() {
    return title;
}
public PageSaver(EventCallback callback) {
this.eventCallback = callback;
client.setConnectTimeout(20, TimeUnit.SECONDS);
client.setReadTimeout(20, TimeUnit.SECONDS);
client.setWriteTimeout(20, TimeUnit.SECONDS);
client.setFollowRedirects(true);
client.setFollowSslRedirects(true);
}
/**
 * Requests that the save in progress stops: raises the cancelled flag and then asks the
 * HTTP client to cancel every call carrying this saver's request tag.
 */
public void cancel() {
    // Raise the flag before aborting the calls, so code reacting to an aborted call already sees it.
    isCancelled = true;
    client.cancel(HTTP_REQUEST_TAG);
}
/**
 * Returns the saver to its initial state: clears the cancelled flag, forgets the title and
 * icon URL, and empties the three download lists.
 */
public void resetState() {
    isCancelled = false;
    title = "";
    pageIconUrl = "";

    cssToGrab.clear();
    framesToGrab.clear();
    filesToGrab.clear();
}
/**
 * Tells whether cancellation has been requested.
 *
 * @return {@code true} once {@code cancel()} has been called and the state has not been reset since
 */
public boolean isCancelled() {
    return isCancelled;
}
/**
 * Saves the page at {@code url} and the resources it references into {@code outputDirPath}.
 * <p>
 * Steps, in order: create the output directory if needed; download and parse the main HTML;
 * download and parse every queued frame; download and parse every queued CSS file; download
 * all remaining files (and the page icon, when one is known) on a thread pool.
 *
 * @param url           address of the page to save
 * @param outputDirPath directory that receives all files; created if it does not exist
 * @param indexFilename file name used for the main HTML document
 * @return {@code false} if the directory could not be created, the main document failed, or
 *         the save was cancelled right after the main document; {@code true} otherwise
 *         (including a cancellation at any later stage)
 */
public boolean getPage(String url, String outputDirPath, String indexFilename) {
    this.indexFileName = indexFilename;

    File outputDir = new File(outputDirPath);
    if (!outputDir.exists() && !outputDir.mkdirs()) {
        eventCallback.onFatalError(new IOException("File " + outputDirPath + " could not be created"), url);
        return false;
    }

    // download main html and parse -- isExtra parameter should be false
    boolean success = downloadHtmlAndParseLinks(url, outputDirPath, false);
    if (isCancelled || !success) {
        return false;
    }

    // Frames may contain further frames, so framesToGrab can grow while we walk it. An index
    // loop re-reads size() on every pass and picks up the new entries; an Iterator would throw
    // ConcurrentModificationException on the first addition.
    for (int i = 0; i < framesToGrab.size(); i++) {
        downloadHtmlAndParseLinks(framesToGrab.get(i), outputDirPath, true);
        if (isCancelled) {
            return true;
        }
    }

    // Same reasoning: parsing a CSS file can queue more CSS files through @import.
    for (int i = 0; i < cssToGrab.size(); i++) {
        if (isCancelled) {
            return true;
        }
        downloadCssAndParse(cssToGrab.get(i), outputDirPath);
    }

    int poolSize = Runtime.getRuntime().availableProcessors();
    ThreadPoolExecutor pool = new ThreadPoolExecutor(poolSize, poolSize, 60, TimeUnit.SECONDS, new BlockingDownloadTaskQueue<Runnable>());

    // Nothing adds to filesToGrab from here on, so its size is fixed for the loop.
    int totalFiles = filesToGrab.size();
    for (int i = 0; i < totalFiles; i++) {
        if (isCancelled) {
            eventCallback.onProgressMessage("Cancelling...");
            shutdownExecutor(pool, 10, TimeUnit.SECONDS);
            return success;
        }
        String urlToDownload = filesToGrab.get(i);
        eventCallback.onProgressMessage("Saving file: " + getFileName(urlToDownload));
        // The list holds no duplicates, so the loop index is the position indexOf() used to find.
        eventCallback.onProgressChanged(i, totalFiles, false);
        pool.submit(new DownloadTask(urlToDownload, outputDir));
    }

    // Only fetch the icon when a URL was actually found for it.
    if (pageIconUrl != null && !pageIconUrl.isEmpty()) {
        pool.submit(new DownloadTask(pageIconUrl, outputDir, "saveForOffline_icon.png"));
    }

    eventCallback.onProgressMessage("Finishing file downloads...");
    shutdownExecutor(pool, 60, TimeUnit.SECONDS);
    return success;
}
/**
 * Downloads one HTML document, rewrites its links through parseHtmlForLinks() and writes the
 * result into {@code outputDir}.
 *
 * @param url       address of the document
 * @param outputDir directory the document is written to
 * @param isExtra   {@code true} for a frame file (saved under a name derived from its URL and
 *                  reported through onError on failure); {@code false} for the main document
 *                  (saved under the index file name and reported through onFatalError)
 * @return {@code true} when the document was saved, {@code false} when an
 *         {@link IOException} or {@link IllegalStateException} occurred
 */
private boolean downloadHtmlAndParseLinks(final String url, final String outputDir, final boolean isExtra) {
    final String filename = isExtra ? getFileName(url) : indexFileName;
    // A URL that ends in '/' gets the file name appended to form the base for relative links.
    final String baseUrl = url.endsWith("/") ? url + filename : url;

    try {
        eventCallback.onProgressMessage(isExtra ? "Getting HTML frame file: " + filename : "Getting main HTML file");
        final String rawHtml = getStringFromUrl(url);

        eventCallback.onProgressMessage(isExtra ? "Processing HTML frame file: " + filename: "Processing main HTML file");
        final String rewrittenHtml = parseHtmlForLinks(rawHtml, baseUrl);

        eventCallback.onProgressMessage(isExtra ? "Saving HTML frame file: " + filename: "Saving main HTML file");
        saveStringToFile(rewrittenHtml, new File(outputDir, filename));
        return true;
    } catch (IOException | IllegalStateException failure) {
        if (!isExtra) {
            eventCallback.onFatalError(failure, url);
        } else {
            eventCallback.onError(failure);
        }
        failure.printStackTrace();
        return false;
    }
}
/**
 * Downloads one CSS file, rewrites the links in it through parseCssForLinks() and writes the
 * result into {@code outputDir} under a name derived from the URL. An {@link IOException}
 * is reported through onError and otherwise ignored.
 *
 * @param url       address of the stylesheet; also used as base for its relative links
 * @param outputDir directory the stylesheet is written to
 */
private void downloadCssAndParse(final String url, final String outputDir) {
    final String filename = getFileName(url);
    try {
        eventCallback.onProgressMessage("Getting CSS file: " + filename);
        final String rawCss = getStringFromUrl(url);

        eventCallback.onProgressMessage("Processing CSS file: " + filename);
        final String rewrittenCss = parseCssForLinks(rawCss, url);

        eventCallback.onProgressMessage("Saving CSS file: " + filename);
        saveStringToFile(rewrittenCss, new File(outputDir, filename));
    } catch (IOException failure) {
        eventCallback.onError(failure);
        failure.printStackTrace();
    }
}
/**
 * Downloads a single URL into a file; meant to be run on the download pool.
 * Failures are never thrown out of {@link #run()}: they are wrapped in an
 * {@link IOException} naming the URL and the target file and handed to the event callback.
 */
private class DownloadTask implements Runnable {
    private final String url;
    private final File outputDir;
    /** Target file name; when null it is derived from the URL at the start of run(). */
    private String fileName;

    /**
     * @param url    address to download
     * @param toPath directory the file is written to; its name is derived from the URL
     */
    public DownloadTask(String url, File toPath) {
        this(url, toPath, null);
    }

    /**
     * @param url      address to download
     * @param toPath   directory the file is written to
     * @param fileName name of the file to write inside {@code toPath}
     */
    public DownloadTask(String url, File toPath, String fileName) {
        this.url = url;
        this.outputDir = toPath;
        this.fileName = fileName;
    }

    @Override
    public void run() {
        if (fileName == null) {
            fileName = getFileName(url);
        }
        File outputFile = new File(outputDir, fileName);

        InputStream is = null;
        FileOutputStream fos = null;
        try {
            // Built inside the try: the builder rejects unusable URLs with an
            // IllegalArgumentException, which must reach the handler below rather than
            // vanish inside the executor.
            Request request = new Request.Builder()
                    .url(url)
                    .addHeader("User-Agent", getOptions().getUserAgent())
                    .tag(HTTP_REQUEST_TAG)
                    .build();
            Response response = client.newCall(request).execute();
            is = response.body().byteStream();
            fos = new FileOutputStream(outputFile);

            final byte[] buffer = new byte[1024 * 32]; // read in batches of 32K
            int length;
            while ((length = is.read(buffer)) != -1) {
                fos.write(buffer, 0, length);
            }
            fos.flush();
            // Closed here as well, so that a failing close is reported like any other write error.
            fos.close();
        } catch (IllegalArgumentException | IOException e) {
            IOException ex = new IOException("File download failed, URL: " + url + ", Output file path: " + outputFile.getPath());
            if (isCancelled) {
                ex.initCause(new IOException("Save was cancelled, isCancelled is true").initCause(e));
                eventCallback.onError(ex);
            } else {
                eventCallback.onError(ex.initCause(e));
            }
        } finally {
            // Runs on success and on failure alike, so neither the file nor the response stream leaks.
            closeQuietly(fos);
            closeQuietly(is);
        }
    }

    /** Closes {@code resource} if it is non-null, ignoring any failure of the close itself. */
    private void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Nothing useful can be done at this point; the primary error, if any, was already reported.
        }
    }
}
/**
 * Fetches {@code url} with the configured user agent and returns the response body as text.
 * The request carries this saver's request tag.
 *
 * @param url address to fetch
 * @return the body of the response as a string
 * @throws IOException           declared for failures of the call or of reading the body
 * @throws IllegalStateException declared by the original signature; callers catch it
 */
private String getStringFromUrl(String url) throws IOException, IllegalStateException {
    final Request.Builder builder = new Request.Builder();
    builder.url(url);
    builder.addHeader("User-Agent", getOptions().getUserAgent());
    builder.tag(HTTP_REQUEST_TAG);

    final Response response = client.newCall(builder.build()).execute();
    final String content = response.body().string();
    response.body().close();
    return content;
}
/**
 * Writes {@code ToSave} to {@code outputFile} encoded as UTF-8. An existing file is left
 * untouched and the call returns without writing.
 *
 * @param ToSave     text to write
 * @param outputFile destination; created by this call when it does not exist yet
 * @throws IOException if the file cannot be created or written
 */
private void saveStringToFile(String ToSave, File outputFile) throws IOException {
    if (outputFile.exists()) {
        return;
    }
    // Explicit charset: getBytes() without one depends on the platform default.
    byte[] bytes = ToSave.getBytes("UTF-8");

    // Opening the stream creates the file, so no separate createNewFile() is needed.
    FileOutputStream fos = new FileOutputStream(outputFile);
    try {
        fos.write(bytes);
        fos.flush();
    } finally {
        // Release the file handle even when the write fails.
        fos.close();
    }
}
/**
 * Parses an HTML document, queues the resources it references for download and rewrites the
 * referencing attributes so that they point at the local copies.
 * <p>
 * Which kinds of elements are processed depends on the options: frames and iframes; link
 * elements, embedded and inline CSS, image inputs and background attributes ("other");
 * scripts; images; video. Stylesheet links go to the CSS list, frames to the frame list and
 * everything else to the file list. As side effects the first call since a reset records
 * the page title (and announces it to the callback) and looks up the page icon URL.
 * <p>
 * An attribute is only rewritten when its absolute URL passes isLinkValid(); anything else
 * (for example a data: URI or an unresolvable link) is left exactly as it was, because no
 * local file will exist for it.
 *
 * @param htmlToParse the HTML source
 * @param baseUrl     base against which relative links are resolved
 * @return the rewritten document as HTML text
 */
private String parseHtmlForLinks(String htmlToParse, String baseUrl) {
    Document document = Jsoup.parse(htmlToParse, baseUrl);
    document.outputSettings().escapeMode(Entities.EscapeMode.extended);

    if (title.isEmpty()) {
        title = document.title();
        eventCallback.onPageTitleAvailable(title);
    }

    // The null check guards against a fetcher that returned null on an earlier call.
    if (pageIconUrl == null || pageIconUrl.isEmpty()) {
        eventCallback.onProgressMessage("Getting icon...");
        pageIconUrl = FaviconFetcher.getInstance().getFaviconUrl(document);
    }

    eventCallback.onProgressMessage("Processing HTML...");

    Elements links;

    if (getOptions().saveFrames()) {
        links = document.select("frame[src]");
        eventCallback.onLogMessage("Got " + links.size() + " frames");
        localizeLinks(links, "src", framesToGrab);

        links = document.select("iframe[src]");
        eventCallback.onLogMessage("Got " + links.size() + " iframes");
        localizeLinks(links, "src", framesToGrab);
    }

    if (getOptions().saveOther()) {
        // Get all the links
        links = document.select("link[href]");
        eventCallback.onLogMessage("Got " + links.size() + " link elements with a href attribute");
        for (Element link : links) {
            String href = link.attr("abs:href");
            if (!isLinkValid(href)) {
                continue;
            }
            // if it is css, parse it later to extract urls (images referenced from "background" attributes for example)
            boolean isStylesheet = link.attr("rel").equalsIgnoreCase("stylesheet");
            // addLinkToList (rather than a plain add) keeps duplicates out of the CSS list too.
            addLinkToList(href, isStylesheet ? cssToGrab : filesToGrab);
            link.attr("href", getFileName(href));
        }

        // get links in embedded css also, and modify the links to point to local files
        links = document.select("style[type=text/css]");
        eventCallback.onLogMessage("Got " + links.size() + " embedded stylesheets, parsing CSS");
        for (Element style : links) {
            String parsedCss = parseCssForLinks(style.data(), baseUrl);
            if (style.dataNodes().size() != 0) {
                style.dataNodes().get(0).setWholeData(parsedCss);
            }
        }

        // get input types with an image type
        links = document.select("input[type=image]");
        eventCallback.onLogMessage("Got " + links.size() + " input elements with type = image");
        localizeLinks(links, "src", filesToGrab);

        // get everything which has a background attribute; the URL lives in that attribute, not in src
        links = document.select("[background]");
        eventCallback.onLogMessage("Got " + links.size() + " elements with a background attribute");
        localizeLinks(links, "background", filesToGrab);

        links = document.select("[style]");
        eventCallback.onLogMessage("Got " + links.size() + " elements with a style attribute, parsing CSS");
        for (Element styled : links) {
            styled.attr("style", parseCssForLinks(styled.attr("style"), baseUrl));
        }
    }

    if (getOptions().saveScripts()) {
        links = document.select("script[src]");
        eventCallback.onLogMessage("Got " + links.size() + " script elements");
        localizeLinks(links, "src", filesToGrab);
    }

    if (getOptions().saveImages()) {
        links = document.select("img[src]");
        eventCallback.onLogMessage("Got " + links.size() + " image elements");
        localizeLinks(links, "src", filesToGrab);
        for (Element image : links) {
            image.removeAttr("srcset"); // we don't use this for now, so remove it.
        }

        links = document.select("img[data-canonical-src]");
        eventCallback.onLogMessage("Got " + links.size() + " image elements, w. data-canonical-src");
        localizeLinks(links, "data-canonical-src", filesToGrab);
        for (Element image : links) {
            image.removeAttr("srcset"); // we don't use this for now, so remove it.
        }
    }

    if (getOptions().saveVideo()) {
        // video src is sometimes in a child element
        links = document.select("video:not([src])");
        eventCallback.onLogMessage("Got " + links.size() + " video elements without src attribute");
        localizeLinks(links.select("[src]"), "src", filesToGrab);

        links = document.select("video[src]");
        eventCallback.onLogMessage("Got " + links.size() + " video elements");
        localizeLinks(links, "src", filesToGrab);
    }

    if (getOptions().makeLinksAbsolute()) {
        // make links absolute, so they are not broken
        links = document.select("a[href]");
        eventCallback.onLogMessage("Making " + links.size() + " links absolute");
        for (Element anchor : links) {
            String absUrl = anchor.attr("abs:href");
            // An empty result means the link could not be made absolute; keep the original then.
            if (!absUrl.isEmpty()) {
                anchor.attr("href", absUrl);
            }
        }
    }

    return document.outerHtml();
}

/**
 * For every element whose {@code attribute} resolves to a downloadable URL: queues that URL
 * in {@code downloadList} and points the attribute at the local file name. Elements whose
 * URL does not pass isLinkValid() are left unchanged.
 */
private void localizeLinks(Elements elements, String attribute, List<String> downloadList) {
    for (Element element : elements) {
        String absoluteUrl = element.attr("abs:" + attribute);
        if (!isLinkValid(absoluteUrl)) {
            continue;
        }
        addLinkToList(absoluteUrl, downloadList);
        element.attr(attribute, getFileName(absoluteUrl));
    }
}
/** Matches a CSS url( ... ) reference; group 2 is the link between the optional quotes. */
private static final Pattern CSS_URL_PATTERN = Pattern.compile("url(\\s*\\(\\s*['\"]*\\s*)(.*?)\\s*['\"]*\\s*\\)");
/** Matches a quoted {@code @import} statement up to the end of its link; group 3 is the link. */
private static final Pattern CSS_IMPORT_PATTERN = Pattern.compile("@(import\\s*['\"])()([^ '\"]*)");

/**
 * Scans CSS for external references, queues them for download and rewrites them to local
 * file names.
 * <p>
 * Links inside url( ... ) are queued in the file list; links of quoted {@code @import}
 * statements are queued in the CSS list so that they get parsed as well. A link is rewritten
 * to its local file name only when it contains a '/', as links without one are left as they
 * are. Empty links and data: URIs are skipped completely.
 *
 * @param cssToParse CSS text (a stylesheet, an embedded style block or a style attribute)
 * @param baseUrl    base against which relative links are resolved
 * @return the CSS with the rewritten links
 */
private String parseCssForLinks(String cssToParse, String baseUrl) {
    eventCallback.onLogMessage("Parsing CSS");
    // find everything inside url(" ... ")
    String css = rewriteCssLinks(cssToParse, CSS_URL_PATTERN, 2, baseUrl, filesToGrab);
    // find css linked with @import
    return rewriteCssLinks(css, CSS_IMPORT_PATTERN, 3, baseUrl, cssToGrab);
}

/**
 * Applies {@code pattern} to {@code css}; for each match queues the link found in group
 * {@code linkGroup} in {@code downloadList} and, when the link contains a '/', replaces
 * exactly that group's span with the local file name.
 */
private String rewriteCssLinks(String css, Pattern pattern, int linkGroup, String baseUrl, List<String> downloadList) {
    Matcher matcher = pattern.matcher(css);
    StringBuilder rewritten = new StringBuilder(css.length());
    int copiedUpTo = 0;

    while (matcher.find()) {
        String link = matcher.group(linkGroup).trim();
        // Inline data must stay byte-for-byte intact, and an empty link refers to nothing.
        if (link.isEmpty() || link.regionMatches(true, 0, "data:", 0, 5)) {
            continue;
        }
        if (link.contains("/")) {
            // Replace only the matched span; a global text replace could hit unrelated occurrences.
            rewritten.append(css, copiedUpTo, matcher.start(linkGroup)).append(getFileName(link));
            copiedUpTo = matcher.end(linkGroup);
        }
        addLinkToList(link, baseUrl, downloadList);
    }

    rewritten.append(css, copiedUpTo, css.length());
    return rewritten.toString();
}
/**
 * Decides whether a link can be downloaded by this saver.
 *
 * @param url the link to check, may be null
 * @return {@code true} only for a non-null, non-empty string that starts with "http"
 */
private boolean isLinkValid(String url) {
    return url != null && url.length() != 0 && url.startsWith("http");
}
/**
 * Appends {@code link} to {@code list} when it is a valid link that the list does not
 * contain yet; does nothing otherwise.
 *
 * @param link absolute URL to queue
 * @param list download list to append to
 */
private void addLinkToList(String link, List<String> list) {
    if (isLinkValid(link) && !list.contains(link)) {
        list.add(link);
    }
}
/**
 * Resolves {@code link} against {@code baseUrl} and appends the result to {@code list} when
 * it is a valid link that the list does not contain yet. Links starting with "data:image"
 * and links that cannot be resolved into a URL are ignored.
 *
 * @param link    link as found in the source, possibly relative
 * @param baseUrl URL the link is relative to
 * @param list    download list to append to
 */
private void addLinkToList(String link, String baseUrl, List<String> list) {
    if (link.startsWith("data:image")) {
        return;
    }

    String resolved;
    try {
        resolved = new URL(new URL(baseUrl), link).toString();
    } catch (MalformedURLException unresolvable) {
        return;
    }

    if (isLinkValid(resolved) && !list.contains(resolved)) {
        list.add(resolved);
    }
}
/**
 * Derives a local file name from a URL.
 * <p>
 * The name is the text after the last '/'. If that is blank, the hash code of the whole URL
 * is used instead. A query part (everything after the first '?') is replaced by its hash
 * code. Finally every character outside letters, digits, '-', '_' and '.' becomes '_' and
 * the name is cut to at most 200 characters.
 *
 * @param url the URL (or link text) to derive the name from
 * @return the sanitized file name
 */
private String getFileName(String url) {
    String name = url.substring(url.lastIndexOf('/') + 1);
    if (name.trim().length() == 0) {
        name = String.valueOf(url.hashCode());
    }

    final int queryStart = name.indexOf("?");
    if (queryStart >= 0) {
        name = name.substring(0, queryStart) + name.substring(queryStart + 1).hashCode();
    }

    final String sanitized = fileNameReplacementPattern.matcher(name).replaceAll("_");
    return sanitized.length() > 200 ? sanitized.substring(0, 200) : sanitized;
}
/**
 * Shuts the executor down and waits for its tasks to finish.
 * <p>
 * If the tasks are not done after the given time, the situation is reported through onError
 * and the executor is stopped with shutdownNow(). If the waiting thread is interrupted, the
 * executor is stopped as well, the interrupt status is restored for the caller and the
 * interruption is reported through onError.
 *
 * @param e            executor to shut down
 * @param waitTime     how long to wait for termination
 * @param waitTimeUnit unit of {@code waitTime}
 */
private void shutdownExecutor(ExecutorService e, int waitTime, TimeUnit waitTimeUnit) {
    e.shutdown();
    try {
        if (!e.awaitTermination(waitTime, waitTimeUnit)) {
            eventCallback.onError("Executor pool did not termimate after " + waitTime + " " + waitTimeUnit.toString() +", doing shutdownNow()");
            e.shutdownNow();
        }
    } catch (InterruptedException ie) {
        // We can no longer wait, so do not leave the workers running behind our back.
        e.shutdownNow();
        // Catching the exception cleared the interrupt status; restore it for code further up the stack.
        Thread.currentThread().interrupt();
        eventCallback.onError(ie);
    }
}
/**
 * A {@link SynchronousQueue} whose {@code offer} waits like {@code put} instead of failing
 * immediately. Used as the work queue of the download pool in getPage().
 */
private class BlockingDownloadTaskQueue<E> extends SynchronousQueue<E> {
    /**
     * Hands {@code task} over with the blocking {@code put}.
     *
     * @return {@code true} once the element was handed over; {@code false} if the thread was
     *         interrupted while waiting, in which case the interrupt status is restored and
     *         the interruption is reported through onError
     */
    @Override
    public boolean offer(E task) {
        boolean handedOver = false;
        try {
            put(task);
            handedOver = true;
        } catch (InterruptedException interrupted) {
            Thread.currentThread().interrupt();
            eventCallback.onError(interrupted);
        }
        return handedOver;
    }
}
/**
 * Settings of a PageSaver: which kinds of resources are saved, whether anchors are made
 * absolute, the user agent sent with requests, and the HTTP cache of the shared client.
 * Each boolean setting has a getter and a setter of the same name.
 */
class Options {
    private boolean makeLinksAbsolute = true;
    private boolean saveImages = true;
    private boolean saveFrames = true;
    private boolean saveOther = true;
    private boolean saveScripts = true;
    private boolean saveVideo = false;
    // Default is a single space, not an empty string.
    private String userAgent = " ";

    /**
     * Creates a response cache and installs it on the HTTP client.
     *
     * @param cacheDirectory directory passed to the cache
     * @param maxCacheSize   maximum size passed to the cache
     */
    public void setCache(File cacheDirectory, long maxCacheSize) {
        Cache cache = new Cache(cacheDirectory, maxCacheSize);
        client.setCache(cache);
    }

    /**
     * Evicts every entry of the client's cache. Does nothing when the client has no cache.
     *
     * @throws IOException if the eviction fails
     */
    public void clearCache() throws IOException {
        Cache cache = client.getCache();
        // No cache installed means there is nothing to clear; previously this was a NullPointerException.
        if (cache != null) {
            cache.evictAll();
        }
    }

    /** @return the value sent in the User-Agent header of every request */
    public String getUserAgent() {
        return userAgent;
    }

    /** @param userAgent value to send in the User-Agent header of every request */
    public void setUserAgent(final String userAgent) {
        this.userAgent = userAgent;
    }

    /** @return whether anchor links are rewritten to absolute URLs */
    public boolean makeLinksAbsolute() {
        return makeLinksAbsolute;
    }

    /** @param makeLinksAbsolute whether anchor links are rewritten to absolute URLs */
    public void makeLinksAbsolute(final boolean makeLinksAbsolute) {
        this.makeLinksAbsolute = makeLinksAbsolute;
    }

    /** @return whether images are saved */
    public boolean saveImages() {
        return saveImages;
    }

    /** @param saveImages whether images are saved */
    public void saveImages(final boolean saveImages) {
        this.saveImages = saveImages;
    }

    /** @return whether frames and iframes are saved */
    public boolean saveFrames() {
        return saveFrames;
    }

    /** @param saveFrames whether frames and iframes are saved */
    public void saveFrames(final boolean saveFrames) {
        this.saveFrames = saveFrames;
    }

    /** @return whether external scripts are saved */
    public boolean saveScripts() {
        return saveScripts;
    }

    /** @param saveScripts whether external scripts are saved */
    public void saveScripts(final boolean saveScripts) {
        this.saveScripts = saveScripts;
    }

    /** @return whether link elements, CSS references, image inputs and backgrounds are saved */
    public boolean saveOther() {
        return saveOther;
    }

    /** @param saveOther whether link elements, CSS references, image inputs and backgrounds are saved */
    public void saveOther(final boolean saveOther) {
        this.saveOther = saveOther;
    }

    /** @return whether video sources are saved */
    public boolean saveVideo() {
        return saveVideo;
    }

    /** @param saveVideo whether video sources are saved */
    public void saveVideo(final boolean saveVideo) {
        this.saveVideo = saveVideo;
    }
}
}
/**
 * Receiver of everything a PageSaver reports while it works: progress, log output, the page
 * title, and errors.
 */
interface EventCallback {

    /**
     * Reports download progress.
     *
     * @param progress      current position
     * @param maxProgress   total number of steps
     * @param indeterminate whether the amount of work is unknown
     */
    void onProgressChanged(int progress, int maxProgress, boolean indeterminate);

    /**
     * Reports a short status text describing the current step.
     *
     * @param fileName the status text
     */
    void onProgressMessage(String fileName);

    /**
     * Reports the title of the page once it is known.
     *
     * @param pageTitle the title
     */
    void onPageTitleAvailable(String pageTitle);

    /**
     * Reports a log line.
     *
     * @param message the log text
     */
    void onLogMessage(String message);

    /**
     * Reports a non-fatal error.
     *
     * @param error the cause
     */
    void onError(Throwable error);

    /**
     * Reports a non-fatal error that has only a description.
     *
     * @param errorMessage the description
     */
    void onError(String errorMessage);

    /**
     * Reports an error that makes saving the page impossible.
     *
     * @param error   the cause
     * @param pageUrl address of the page that was being saved
     */
    void onFatalError(Throwable error, String pageUrl);
}